knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE, fig.align = 'center') knitr::opts_knit$set(root.dir = "/mnt/raid5/Personal/tangchao/project/Nanopore/BarcodeDecomplex")
library(PorexploreR) library(data.table) library(rhdf5) library(ggplot2) library(parallel) library(changepoint) library(caret) library(pbapply) library(GenomicAlignments) library(ggbeeswarm)
readFast5 <- function(file, NT) { reads <- rhdf5::h5ls(file = file, recursive = FALSE)$name parallel::mclapply(X = reads, FUN = function(read) { # h5closeAll() h5_signals <- tryCatch(rhdf5::h5read(file = file, name = paste0(read, "/Raw"), read.attributes = TRUE), error = function(e) NA) if(is.na(h5_signals)) return(NA) signal_meta <- rhdf5::h5read(file, paste0(read, "/channel_id"), read.attributes = TRUE) range <- attr(signal_meta, "range") digitisation <- attr(signal_meta, "digitisation") scaling <- range/digitisation offset <- attr(signal_meta, "offset") sampling_rate <- attr(signal_meta, "sampling_rate") new("Squiggle", raw_signal = as.integer(h5_signals[[1]]), range = as.numeric(range), digitisation = as.integer(digitisation), offset = as.integer(offset), sampling_rate = as.integer(sampling_rate), scaling = as.numeric(scaling)) }, mc.cores = NT) -> res names(res) <- reads return(res) } normalize_signal <- function(sig) { med = median(sig) mad = median(abs(sig - med)) (sig - med) / max(0.01, (mad * 1.4826)) } GetBarcode3 <- function(read, length = 20000, plot = TRUE) { raw_sig <- PorexploreR::signal(read) if(length(raw_sig) > length) raw_sig <- raw_sig[seq_len(length)] Mat <- data.table::data.table(x = seq_along(raw_sig), norm = normalize_signal(raw_sig)) Mat[, dema := smoother::smth(norm, method = "dema", n = 20)] cp2 <- Mat[!is.na(dema), suppressWarnings(changepoint::cpt.meanvar(dema, class = FALSE))[[1]]] + Mat[!is.na(dema), min(x)] ansmean <- tryCatch(suppressWarnings(changepoint::cpt.meanvar(Mat[cp2:nrow(Mat), dema], penalty = "Asymptotic", pen.value = 1e-10, method = "PELT")), error = function(e) NA) if(is.na(ansmean)) return(NULL) whichBin <- which.max(diff(c(0, head(ansmean@cpts, 4)))) polyA_Pos <- ifelse(whichBin == 1, cp2, ansmean@cpts[whichBin - 1] + cp2) polyA_Sig <- ansmean@param.est$mean[whichBin] if(polyA_Pos < 2500) { BCSs <- NULL } else { if(mean(Mat[round(polyA_Pos*0.75):polyA_Pos, dema] < polyA_Sig) > 0.9) { # Mat[1:polyA_Pos, smth := smoother::smth(norm, method = "dema", n = round(polyA_Pos/400), v = 1)] # BCSs <- Mat[!is.na(smth), smth] BCSs <- Mat[1:polyA_Pos, norm] if(plot) { Mat[1:polyA_Pos, smth := smoother::smth(norm, method = "dema", n = round(polyA_Pos/400), v = 1)] Mat[, plot(x, norm, type = "s")] Mat[, lines(x, smth, type = "s", col = 2)] abline(v = polyA_Pos, col = 3, lty = 2, lwd = 3) } } else { if(mean(Mat[round(cp2*0.75):cp2, dema] < polyA_Sig) > 0.9) { BCSs <- Mat[1:cp2, norm] if(plot) { Mat[1:polyA_Pos, smth := smoother::smth(norm, method = "dema", n = round(polyA_Pos/400), v = 1)] Mat[, plot(x, norm, type = "s")] Mat[1:cp2, lines(x, smth, type = "s", col = 2)] abline(v = cp2, col = 3, lty = 2, lwd = 3) } } else { BCSs <- NULL } } } if(!is.null(BCSs)) { if(length(BCSs) < 2100) { BCSs <- NULL } } return(BCSs) } MyChangePoint <- function(sig, MinLength = 10, ChangePoints = 68, StateStat = "Mean") { if(is.null(StateStat) | is.na(StateStat)) { stop("StateStat must be one of Mean or Median") } if(length(StateStat) != 1) { stop("StateStat must be one of Mean or Median") } if(!is.element(StateStat, c("Mean", "Median"))) { stop("StateStat must be one of Mean or Median") } cp0 <- suppressWarnings(changepoint::cpt.meanvar(data = sig, Q = ChangePoints, penalty = "Manual", method = "BinSeg", class = FALSE, minseglen = MinLength, param.estimates = FALSE, pen.value = 0.0001)) - 0.5 bins <- cut(seq_along(sig), c(0, cp0, length(sig)), include.lowest = T, labels = FALSE) if(StateStat == "Mean") { bin_sig <- as.numeric(by(sig, bins, mean)) } else { bin_sig <- as.numeric(by(sig, bins, median)) } return(bin_sig) }
files <- list.files("/mnt/raid61/Personal_data/songjunwei/DRS_RTA/batch4_f5_fq/20211022_1301_MN26652_FAQ89815_36b5e210/fast5_pass", "fast5$", recursive = TRUE, full.names = TRUE)
barcodeSigs <- lapply(files, function(file1) { print(file1) fast5_1 <- readFast5(file = file1, NT = 10) barcode <- mclapply(fast5_1, function(x) GetBarcode3(read = x, plot = F), mc.cores = 10) names(barcode) <- names(fast5_1) barcode[!mapply(is.null, barcode)] }) barcodeSigs <- do.call(c, barcodeSigs)
barcodeSigsBin <- pblapply(FUN = function(x) { MyChangePoint(sig = x, ChangePoints = 98, MinLength = 10, StateStat = "Mean") }, barcodeSigs, cl = 4)
barcodeSigsBinMat <- as.data.frame(do.call(rbind, barcodeSigsBin)) colnames(barcodeSigsBinMat) <- paste0("BIN", sprintf("%03d", seq_len(ncol(barcodeSigsBinMat)))) saveRDS(barcodeSigsBinMat, "./analysis/08.m6A/01.Plasmodium/20211025/01.BarcodeDecomplex/barcodeSigsBinMat.Rds")
barcodeSigsBinMat <- readRDS("./analysis/08.m6A/01.Plasmodium/20211025/01.BarcodeDecomplex/barcodeSigsBinMat.Rds") load(file = "./analysis/07.VirusClassification/03.Merge/01.Classifiers/20211008_6.RData")
Fit1
ROC_Tab <- data.frame(predict(Fit1, barcodeSigsBinMat, type = "prob"), pred = predict(Fit1, newdata = barcodeSigsBinMat)) saveRDS(ROC_Tab, "./analysis/08.m6A/01.Plasmodium/20211025/01.BarcodeDecomplex/BarcodesPrediction.Rds")
Preds <- readRDS("./analysis/08.m6A/01.Plasmodium/20211025/01.BarcodeDecomplex/BarcodesPrediction.Rds") colnames(Preds) <- gsub("\\.", "-", colnames(Preds)) Preds <- as.data.table(Preds, keep.rownames = "qname") Preds$PP <- apply(Preds[, grepl("RTA", colnames(Preds)), with = F], 1, max)
Preds[, .N, pred] ggplot(Preds, aes(x = pred, y = PP)) + geom_violin() + scale_y_continuous(limits = c(0, 1)) + theme_classic(base_size = 15)
Mat <- copy(Preds) lapply(seq(0, 100, 5)/100, function(i) { ClassifiedReads <- Mat[, sum(PP >= i)] ReadsPercent <- Mat[, mean(PP >= i) * 100] data.frame(ClassifiedReads = ClassifiedReads, ClassifiedReadsPercent = ReadsPercent) }) -> Cutoff_Select Cutoff_Select <- as.data.table(do.call(rbind, Cutoff_Select)) Cutoff_Select$Cutoff <- seq(0, 100, 5)/100
ggplot(Cutoff_Select, aes(Cutoff, ClassifiedReadsPercent)) + geom_line() + labs(x = "PP", y = "Percentage of successful reads") + theme_classic() + theme(legend.position = "none", axis.title = element_text(size = 16), axis.text = element_text(size = 12)) + geom_vline(xintercept = 0.3)
Preds[PP > 0.0, .N, pred][order(pred), ] Preds[PP > 0.3, .N, pred][order(pred), ] Preds[PP > 0.5, .N, pred][order(pred), ]
Preds[PP <= 0.3, pred := "Unclassified"]
library(microseq) library(Biostrings) fqfs <- list.files("/mnt/raid61/Personal_data/songjunwei/DRS_RTA/batch4_f5_fq/hac_fastq/pass", "fastq", full.names = TRUE) fastq <- lapply(fqfs, microseq::readFastq) fastq <- do.call(rbind, fastq) id <- fastq$Header fastq <- QualityScaledDNAStringSet(x = RNAStringSet(fastq$Sequence), quality = PhredQuality(fastq$Quality)) names(fastq) <- id
setdiff(Preds[, levels(pred)], "Unclassified") for(i in setdiff(Preds[, levels(pred)], "Unclassified")) { fi <- fastq[paste0("read_", mapply(function(x) x[1], strsplit(names(fastq), " "))) %in% Preds[pred == i, qname]] writeQualityScaledXStringSet(fi, paste0("./analysis/08.m6A/01.Plasmodium/20211025/02.SplitFastq/", i, ".fastq")) }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.